# Copyright (c) 2012, Lambda Foundry, Inc.
# See LICENSE for the license
import bz2
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
from errno import ENOENT
import gzip
import io
import os
import sys
import time
import warnings
import zipfile

from libc.stdlib cimport free
from libc.string cimport strcasecmp, strlen, strncpy

import cython
from cython import Py_ssize_t

from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
from cpython.object cimport PyObject
from cpython.ref cimport Py_XDECREF
from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode


cdef extern from "Python.h":
    # CPython C-API: build a ``str`` from a NUL-terminated UTF-8 C string.
    # Declared here with a plain ``char *`` argument so tokenizer words can
    # be passed to it directly.
    object PyUnicode_FromString(char *v)


import numpy as np

cimport numpy as cnp
from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX

import pandas._libs.lib as lib

from pandas._libs.khash cimport (
    kh_destroy_float64,
    kh_destroy_str,
    kh_destroy_str_starts,
    kh_destroy_strbox,
    kh_exist_str,
    kh_float64_t,
    kh_get_float64,
    kh_get_str,
    kh_get_str_starts_item,
    kh_get_strbox,
    kh_init_float64,
    kh_init_str,
    kh_init_str_starts,
    kh_init_strbox,
    kh_put_float64,
    kh_put_str,
    kh_put_str_starts_item,
    kh_put_strbox,
    kh_resize_float64,
    kh_resize_str_starts,
    kh_str_starts_t,
    kh_str_t,
    kh_strbox_t,
    khiter_t,
)

from pandas.compat import get_lzma_file, import_lzma
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.concat import union_categoricals

# Result of ``import_lzma()``; handed to ``get_lzma_file`` when a source is
# opened with ``compression='xz'``.
lzma = import_lzma()

cdef:
    # C-level copies of +inf / -inf as float64.
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF


cdef extern from "headers/portable.h":
    # Nothing is declared from this header. It is *presumably* included so
    # that `strcasecmp` is defined on Windows; without it the build fails
    # with
    # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
    # in Appveyor.
    # Ideally the `from libc.string cimport` above would fail loudly
    # instead of leaving an unresolved symbol for the linker.
    pass


# Declarations for the C tokenizer: its state enum, the ``parser_t`` struct
# that carries all tokenizer state/options, column iteration helpers, and
# the string -> number / boolean conversion routines.
cdef extern from "parser/tokenizer.h":

    # Values taken by ``parser_t.state``.
    ctypedef enum ParserState:
        START_RECORD
        START_FIELD
        ESCAPED_CHAR
        IN_FIELD
        IN_QUOTED_FIELD
        ESCAPE_IN_QUOTED_FIELD
        QUOTE_IN_QUOTED_FIELD
        EAT_CRNL
        EAT_CRNL_NOP
        EAT_WHITESPACE
        EAT_COMMENT
        EAT_LINE_COMMENT
        WHITESPACE_LINE
        SKIP_LINE
        FINISHED

    enum: ERROR_OVERFLOW

    # Signature of the "read more bytes" callback stored in ``cb_io``.
    ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                  int *status)
    # Signature of the source teardown callback stored in ``cb_cleanup``.
    ctypedef int (*io_cleanup)(void *src)

    ctypedef struct parser_t:
        # Opaque data source plus the callbacks used to read from it and
        # to release it.
        void *source
        io_callback cb_io
        io_cleanup cb_cleanup

        int64_t chunksize  # Number of bytes to prepare for each chunk
        char *data         # pointer to data to be processed
        int64_t datalen    # amount of data available
        int64_t datapos

        # where to write out tokenized data
        char *stream
        uint64_t stream_len
        uint64_t stream_cap

        # Store words in (potentially ragged) matrix for now, hmm
        char **words
        int64_t *word_starts  # where we are in the stream
        uint64_t words_len
        uint64_t words_cap
        uint64_t max_words_cap   # maximum word cap encountered

        char *pword_start        # pointer to stream start of current field
        int64_t word_start       # position start of current field

        int64_t *line_start      # position in words for start of line
        int64_t *line_fields     # Number of fields in each line
        uint64_t lines           # Number of lines observed
        uint64_t file_lines      # Number of lines observed (with bad/skipped)
        uint64_t lines_cap       # Vector capacity

        # Tokenizing stuff
        ParserState state
        int doublequote            # is " represented by ""?
        char delimiter             # field separator
        int delim_whitespace       # consume tabs / spaces instead
        char quotechar             # quote character
        char escapechar            # escape character
        char lineterminator
        int skipinitialspace       # ignore spaces following delimiter?
        int quoting                # style of quoting to write

        char commentchar
        int allow_embedded_newline
        int strict                 # raise exception on bad CSV

        int usecols

        int expected_fields
        int error_bad_lines
        int warn_bad_lines

        # floating point options
        char decimal
        char sci

        # thousands separator (comma, period)
        char thousands

        int header                  # Boolean: 1: has header, 0: no header
        int64_t header_start        # header row start
        uint64_t header_end         # header row end

        void *skipset
        PyObject *skipfunc
        int64_t skip_first_N_rows
        int64_t skipfooter
        # pick one, depending on whether the converter requires GIL
        float64_t (*double_converter)(const char *, char **,
                                      char, char, char,
                                      int, int *, int *) nogil

        # error handling: messages allocated on the C side
        char *warn_msg
        char *error_msg

        int64_t skip_empty_lines

    # Cursor used to walk the words of a single column.
    ctypedef struct coliter_t:
        char **words
        int64_t *line_start
        int64_t col

    # Flags recorded while parsing unsigned integers.
    ctypedef struct uint_state:
        int seen_sint
        int seen_uint
        int seen_null

    void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

    void coliter_setup(coliter_t *it, parser_t *parser,
                       int64_t i, int64_t start) nogil
    void COLITER_NEXT(coliter_t, const char *) nogil

    # Parser lifecycle: allocate, initialise, release buffers, delete.
    parser_t* parser_new()

    int parser_init(parser_t *self) nogil
    void parser_free(parser_t *self) nogil
    void parser_del(parser_t *self) nogil
    int parser_add_skiprow(parser_t *self, int64_t row)

    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)

    void parser_set_default_options(parser_t *self)

    int parser_consume_rows(parser_t *self, size_t nrows)

    int parser_trim_buffers(parser_t *self)

    # Tokenization entry points; both can run without the GIL.
    int tokenize_all_rows(parser_t *self) nogil
    int tokenize_nrows(parser_t *self, size_t nrows) nogil

    # String -> integer conversion.
    int64_t str_to_int64(char *p_item, int64_t int_min,
                         int64_t int_max, int *error, char tsep) nogil
    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                           uint64_t uint_max, int *error, char tsep) nogil

    # String -> float conversion; all three share the signature of
    # ``parser_t.double_converter``.
    float64_t xstrtod(const char *p, char **q, char decimal,
                      char sci, char tsep, int skip_trailing,
                      int *error, int *maybe_int) nogil
    float64_t precise_xstrtod(const char *p, char **q, char decimal,
                              char sci, char tsep, int skip_trailing,
                              int *error, int *maybe_int) nogil
    float64_t round_trip(const char *p, char **q, char decimal,
                         char sci, char tsep, int skip_trailing,
                         int *error, int *maybe_int) nogil

    int to_boolean(const char *item, uint8_t *val) nogil


# Data-source backends for the tokenizer. Each backend offers a constructor
# (``new_*``), a cleanup function matching ``io_cleanup`` (``del_*``) and a
# reader matching ``io_callback`` (``buffer_*_bytes``).
cdef extern from "parser/io.h":
    # Memory-mapped file source.
    void *new_mmap(char *fname)
    int del_mmap(void *src)
    void* buffer_mmap_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    # File source opened by name; a NULL return raises a Python exception.
    void *new_file_source(char *fname, size_t buffer_size) except NULL

    # Source wrapping a Python object (used for objects exposing ``read``);
    # a NULL return raises a Python exception.
    void *new_rd_source(object obj) except NULL

    int del_file_source(void *src)
    int del_rd_source(void *src)

    void* buffer_file_bytes(void *source, size_t nbytes,
                            size_t *bytes_read, int *status)

    void* buffer_rd_bytes(void *source, size_t nbytes,
                          size_t *bytes_read, int *status)


# Default for ``TextReader``'s ``tokenize_chunksize`` argument, which is
# stored in ``parser_t.chunksize`` (bytes prepared per chunk): 256 KiB.
DEFAULT_CHUNKSIZE = 256 * 1024


cdef class TextReader:
    """

    # source: StringIO or file object

    """

    cdef:
        parser_t *parser
        object na_fvalues
        object true_values, false_values
        object handle
        bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
        uint64_t parser_start
        list clocks
        char *c_encoding
        kh_str_starts_t *false_set
        kh_str_starts_t *true_set

    cdef public:
        int64_t leading_cols, table_width, skipfooter, buffer_lines
        bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory
        bint delim_whitespace
        object delimiter, converters
        object na_values
        object header, orig_header, names, header_start, header_end
        object index_col
        object skiprows
        object dtype
        object encoding
        object compression
        object usecols
        list dtype_cast_order
        set unnamed_cols
        set noconvert

    def __cinit__(self, source,
                  delimiter=b',',
                  header=0,
                  header_start=0,
                  header_end=0,
                  index_col=None,
                  names=None,
                  bint memory_map=False,
                  tokenize_chunksize=DEFAULT_CHUNKSIZE,
                  bint delim_whitespace=False,
                  compression=None,
                  converters=None,
                  bint skipinitialspace=False,
                  escapechar=None,
                  bint doublequote=True,
                  quotechar=b'"',
                  quoting=0,
                  lineterminator=None,
                  encoding=None,
                  comment=None,
                  decimal=b'.',
                  thousands=None,
                  dtype=None,
                  usecols=None,
                  bint error_bad_lines=True,
                  bint warn_bad_lines=True,
                  bint na_filter=True,
                  na_values=None,
                  na_fvalues=None,
                  bint keep_default_na=True,
                  true_values=None,
                  false_values=None,
                  bint allow_leading_cols=True,
                  bint low_memory=False,
                  skiprows=None,
                  skipfooter=0,
                  bint verbose=False,
                  bint mangle_dupe_cols=True,
                  float_precision=None,
                  bint skip_blank_lines=True):
        # Allocate the C tokenizer, attach ``source`` to it, copy the parsing
        # options onto the ``parser_t`` struct and onto this object, and
        # finally read the header so that ``table_width`` is known.
        #
        # Raises
        # ------
        # ValueError
        #     For a separator / comment marker longer than one character, a
        #     line terminator / decimal / thousands / escape marker that is
        #     not exactly one character, or an unknown ``float_precision``.
        # EmptyDataError
        #     When the header step finds no columns to parse.
        #
        # The statement order matters: the source must be attached before the
        # parser is initialised, and the header is read last because it
        # relies on nearly every option set before it.

        # set encoding for native Python and C library
        if encoding is not None:
            if not isinstance(encoding, bytes):
                encoding = encoding.encode('utf-8')
            encoding = encoding.lower()
            self.c_encoding = <char*>encoding
        else:
            self.c_encoding = NULL

        # Keeping the bytes object on ``self`` also keeps the buffer behind
        # ``c_encoding`` alive.
        self.encoding = encoding

        self.parser = parser_new()
        self.parser.chunksize = tokenize_chunksize

        self.mangle_dupe_cols = mangle_dupe_cols

        # For timekeeping
        self.clocks = []

        self.compression = compression
        self.memory_map = memory_map

        self.parser.usecols = (usecols is not None)

        self._setup_parser_source(source)
        parser_set_default_options(self.parser)

        parser_init(self.parser)

        if delim_whitespace:
            self.parser.delim_whitespace = delim_whitespace
        else:
            if len(delimiter) > 1:
                raise ValueError('only length-1 separators supported right now')
            self.parser.delimiter = ord(delimiter)

        # ----------------------------------------
        # parser options

        self.parser.doublequote = doublequote
        self.parser.skipinitialspace = skipinitialspace
        self.parser.skip_empty_lines = skip_blank_lines

        if lineterminator is not None:
            if len(lineterminator) != 1:
                raise ValueError('Only length-1 line terminators supported')
            self.parser.lineterminator = ord(lineterminator)

        if len(decimal) != 1:
            raise ValueError('Only length-1 decimal markers supported')
        self.parser.decimal = ord(decimal)

        if thousands is not None:
            if len(thousands) != 1:
                raise ValueError('Only length-1 thousands markers supported')
            self.parser.thousands = ord(thousands)

        if escapechar is not None:
            if len(escapechar) != 1:
                raise ValueError('Only length-1 escapes supported')
            self.parser.escapechar = ord(escapechar)

        self._set_quoting(quotechar, quoting)

        # Order in which dtypes are tried when a column's dtype is inferred.
        dtype_order = ['int64', 'float64', 'bool', 'object']
        if quoting == QUOTE_NONNUMERIC:
            # consistent with csv module semantics, cast all to float
            dtype_order = dtype_order[1:]
        self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

        if comment is not None:
            if len(comment) > 1:
                raise ValueError('Only length-1 comment characters supported')
            self.parser.commentchar = ord(comment)

        # error handling of bad lines
        self.parser.error_bad_lines = int(error_bad_lines)
        self.parser.warn_bad_lines = int(warn_bad_lines)

        self.skiprows = skiprows
        if skiprows is not None:
            self._make_skiprow_set()

        self.skipfooter = skipfooter

        # suboptimal
        if usecols is not None:
            self.has_usecols = 1
            # GH-20558, validate usecols at higher level and only pass clean
            # usecols into TextReader.
            self.usecols = usecols

        # XXX: with a footer to skip, bad lines are neither reported nor
        # treated as errors.
        if skipfooter > 0:
            self.parser.error_bad_lines = 0
            self.parser.warn_bad_lines = 0

        self.delimiter = delimiter
        self.delim_whitespace = delim_whitespace

        self.na_values = na_values
        if na_fvalues is None:
            na_fvalues = set()
        self.na_fvalues = na_fvalues

        # User-supplied boolean literals are placed ahead of the module-level
        # defaults.
        self.true_values = _maybe_encode(true_values) + _true_values
        self.false_values = _maybe_encode(false_values) + _false_values

        self.true_set = kset_from_list(self.true_values)
        self.false_set = kset_from_list(self.false_values)

        self.keep_default_na = keep_default_na
        self.converters = converters
        self.na_filter = na_filter

        self.verbose = verbose
        self.low_memory = low_memory

        # Select the C string -> double converter.
        if float_precision == "round_trip":
            # see gh-15140
            self.parser.double_converter = round_trip
        elif float_precision == "legacy":
            self.parser.double_converter = xstrtod
        elif float_precision == "high" or float_precision is None:
            self.parser.double_converter = precise_xstrtod
        else:
            raise ValueError(f'Unrecognized float_precision option: '
                             f'{float_precision}')

        # Normalise dtype specifications (single or per-column) up front.
        if isinstance(dtype, dict):
            dtype = {k: pandas_dtype(dtype[k])
                     for k in dtype}
        elif dtype is not None:
            dtype = pandas_dtype(dtype)

        self.dtype = dtype

        # XXX
        self.noconvert = set()

        self.index_col = index_col

        # ----------------------------------------
        # header stuff

        self.allow_leading_cols = allow_leading_cols
        self.leading_cols = 0

        # TODO: no header vs. header is not the first row
        self.has_mi_columns = 0
        self.orig_header = header
        if header is None:
            # sentinel value
            self.parser.header_start = -1
            self.parser.header_end = -1
            self.parser.header = -1
            self.parser_start = 0
            self.header = []
        else:
            if isinstance(header, list):
                if len(header) > 1:
                    # need to artificially skip the final line
                    # which is still a header line
                    header = list(header)
                    header.append(header[-1] + 1)
                    self.parser.header_end = header[-1]
                    self.has_mi_columns = 1
                else:
                    self.parser.header_end = header[0]

                self.parser_start = header[-1] + 1
                self.parser.header_start = header[0]
                self.parser.header = header[0]
                self.header = header
            else:
                self.parser.header_start = header
                self.parser.header_end = header
                self.parser_start = header + 1
                self.parser.header = header
                self.header = [ header ]

        self.names = names
        self.header, self.table_width, self.unnamed_cols = self._get_header()

        if not self.table_width:
            raise EmptyDataError("No columns to parse from file")

        # Compute buffer_lines as function of table width: the largest power
        # of two strictly below twice... i.e. doubled while still under the
        # ``2**20 // table_width`` heuristic.
        heuristic = 2**20 // self.table_width
        self.buffer_lines = 1
        while self.buffer_lines * 2 < heuristic:
            self.buffer_lines *= 2

    def __init__(self, *args, **kwargs):
        """Accept and ignore all arguments; setup happens in ``__cinit__``."""
        return None

    def __dealloc__(self):
        # Tear-down order: parser buffers, then the two boolean lookup
        # tables, and the parser struct itself last.
        parser_free(self.parser)

        if self.true_set != NULL:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL

        if self.false_set != NULL:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL

        parser_del(self.parser)

    def close(self):
        """
        Close the derived file handle, if any, and release parser memory.

        The parser struct itself is not deleted here; only its buffers and
        the true/false lookup tables are freed.
        """
        # A handle only exists when this object wrapped the source itself
        # (e.g. for decompression); it has to be closed explicitly.
        handle = self.handle
        if handle is not None:
            handle.close()

        # Free allocated memory eagerly instead of waiting for deallocation.
        parser_free(self.parser)

        if self.true_set != NULL:
            kh_destroy_str_starts(self.true_set)
            self.true_set = NULL

        if self.false_set != NULL:
            kh_destroy_str_starts(self.false_set)
            self.false_set = NULL

    def set_error_bad_lines(self, int status):
        """Store ``status`` in the C parser's ``error_bad_lines`` field."""
        self.parser.error_bad_lines = status

    def _set_quoting(self, quote_char, quoting):
        """
        Validate the quoting options and store them on the C parser.

        Parameters
        ----------
        quote_char : str, bytes or None
            Quote character. ``None`` or an empty str/bytes disables it,
            which is only allowed together with ``QUOTE_NONE``.
        quoting : int
            One of the ``csv.QUOTE_*`` constants.

        Raises
        ------
        TypeError
            If ``quoting`` is not an integer in the valid range, if
            ``quote_char`` has the wrong type or more than one character,
            or if it is missing while quoting is enabled.
        """
        if not isinstance(quoting, int):
            raise TypeError('"quoting" must be an integer')

        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
            raise TypeError('bad "quoting" value')

        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
            dtype = type(quote_char).__name__
            raise TypeError(f'"quotechar" must be string, not {dtype}')

        # Test the length rather than comparing against '' so that an empty
        # bytes object is treated like an empty str instead of reaching
        # ``ord`` below.
        if quote_char is None or len(quote_char) == 0:
            if quoting != QUOTE_NONE:
                raise TypeError("quotechar must be set if quoting enabled")
            self.parser.quoting = quoting
            # -1 marks "no quote character".
            self.parser.quotechar = -1
        elif len(quote_char) > 1:
            raise TypeError('"quotechar" must be a 1-character string')
        else:
            self.parser.quoting = quoting
            self.parser.quotechar = ord(quote_char)

    cdef _make_skiprow_set(self):
        # Hand ``self.skiprows`` to the C parser in whichever form it came:
        # an integer (skip the first N rows), a callable (stored as the skip
        # function), or an iterable of row numbers (added one at a time).
        skiprows = self.skiprows

        if util.is_integer_object(skiprows):
            parser_set_skipfirstnrows(self.parser, skiprows)
        elif callable(skiprows):
            self.parser.skipfunc = <PyObject *>self.skiprows
        else:
            for row in skiprows:
                parser_add_skiprow(self.parser, row)

    cdef _setup_parser_source(self, source):
        # Attach ``source`` to the C parser.
        #
        # If ``self.compression`` is set, ``source`` is first wrapped in the
        # matching decompressing file object (and, when an encoding was
        # given and the wrapper has no ``encoding`` attribute, in a
        # ``TextIOWrapper``); that object is remembered in ``self.handle``
        # so ``close`` can close it.
        #
        # A ``str`` source is treated as a file name and read through the
        # mmap or plain-file backend; anything with a ``read`` attribute goes
        # through the Python-object backend.
        #
        # Raises ValueError for an unknown compression type or a zip archive
        # that does not contain exactly one member, and IOError when
        # ``source`` is neither a path nor file-like.
        cdef:
            void *ptr

        self.parser.cb_io = NULL
        self.parser.cb_cleanup = NULL

        if self.compression:
            if self.compression == 'gzip':
                if isinstance(source, str):
                    source = gzip.GzipFile(source, 'rb')
                else:
                    source = gzip.GzipFile(fileobj=source)
            elif self.compression == 'bz2':
                source = bz2.BZ2File(source, 'rb')
            elif self.compression == 'zip':
                zip_file = zipfile.ZipFile(source)
                zip_names = zip_file.namelist()

                if len(zip_names) != 1:
                    # Rejecting the archive: close it so the handle is not
                    # left open once the exception propagates.
                    zip_file.close()
                    if len(zip_names) == 0:
                        raise ValueError(f'Zero files found in compressed '
                                         f'zip file {source}')
                    raise ValueError(f'Multiple files found in compressed '
                                     f'zip file {zip_names}')

                source = zip_file.open(zip_names[0])
            elif self.compression == 'xz':
                if isinstance(source, str):
                    source = get_lzma_file(lzma)(source, 'rb')
                else:
                    source = get_lzma_file(lzma)(filename=source)
            else:
                raise ValueError(f'Unrecognized compression type: '
                                 f'{self.compression}')

            if (self.encoding and hasattr(source, "read") and
                    not hasattr(source, "encoding")):
                source = io.TextIOWrapper(
                    source, self.encoding.decode('utf-8'), newline='')

                # The wrapper now does the decoding, so from here on the
                # parser is told the data is UTF-8.
                self.encoding = b'utf-8'
                self.c_encoding = <char*>self.encoding

            self.handle = source

        if isinstance(source, str):
            # The C layer needs the file name as bytes.
            encoding = sys.getfilesystemencoding() or "utf-8"
            source = source.encode(encoding)

            if self.memory_map:
                ptr = new_mmap(source)
                if ptr == NULL:
                    # fall back
                    ptr = new_file_source(source, self.parser.chunksize)
                    self.parser.cb_io = &buffer_file_bytes
                    self.parser.cb_cleanup = &del_file_source
                else:
                    self.parser.cb_io = &buffer_mmap_bytes
                    self.parser.cb_cleanup = &del_mmap
            else:
                ptr = new_file_source(source, self.parser.chunksize)
                self.parser.cb_io = &buffer_file_bytes
                self.parser.cb_cleanup = &del_file_source
            self.parser.source = ptr

        elif hasattr(source, 'read'):
            # e.g., StringIO

            ptr = new_rd_source(source)
            self.parser.source = ptr
            self.parser.cb_io = &buffer_rd_bytes
            self.parser.cb_cleanup = &del_rd_source
        else:
            raise IOError(f'Expected file path name or file-like object, '
                          f'got {type(source)} type')

    cdef _get_header(self):
        # Work out the column names, the table width and the placeholder
        # ("Unnamed: ...") names.
        #
        # Returns a 3-tuple ``(header, field_count, unnamed_cols)``:
        #   * ``header`` is a list of lists of names (one inner list per
        #     header row, or ``[self.names]`` when names were passed), or
        #     ``None`` when there is neither a header row nor ``names``;
        #   * ``field_count`` is the number of columns;
        #   * ``unnamed_cols`` is the set of generated placeholder names.
        #
        # Side effects: tokenizes as many rows as needed, may decrement
        # ``self.parser_start`` (multi-index header case), may set
        # ``self.parser.expected_fields`` and sets ``self.leading_cols``.
        #
        # Raises ParserError when the requested header row is beyond the end
        # of the file and ValueError when header names and ``usecols`` do
        # not line up.
        #
        # header is now a list of lists, so field_count should use header[0]

        cdef:
            Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
            char *word
            object name, old_name
            uint64_t hr, data_line = 0
            char *errors = "strict"
            StringPath path = _string_path(self.c_encoding)
            list header = []
            set unnamed_cols = set()

        if self.parser.header_start >= 0:

            # Header is in the file
            for level, hr in enumerate(self.header):

                this_header = []

                if self.parser.lines < hr + 1:
                    self._tokenize_rows(hr + 2)

                if self.parser.lines == 0:
                    field_count = 0
                    start = self.parser.line_start[0]

                # e.g., if header=3 and file only has 2 lines
                elif (self.parser.lines < hr + 1
                      and not isinstance(self.orig_header, list)) or (
                          self.parser.lines < hr):
                    msg = self.orig_header
                    if isinstance(msg, list):
                        joined = ','.join(str(m) for m in msg)
                        msg = f"[{joined}], len of {len(msg)},"
                    raise ParserError(
                        f'Passed header={msg} but only '
                        f'{self.parser.lines} lines in file')

                else:
                    field_count = self.parser.line_fields[hr]
                    start = self.parser.line_start[hr]

                # ``counts`` tracks how often each name was seen in this
                # header row so that duplicates can be mangled.
                counts = {}
                unnamed_count = 0

                for i in range(field_count):
                    word = self.parser.words[start + i]

                    if path == UTF8:
                        name = PyUnicode_FromString(word)
                    elif path == ENCODED:
                        name = PyUnicode_Decode(word, strlen(word),
                                                self.c_encoding, errors)

                    # We use this later when collecting placeholder names.
                    old_name = name

                    if name == '':
                        if self.has_mi_columns:
                            name = f'Unnamed: {i}_level_{level}'
                        else:
                            name = f'Unnamed: {i}'
                        unnamed_count += 1

                    count = counts.get(name, 0)

                    # Append ".N" suffixes until the name is unique.
                    if not self.has_mi_columns and self.mangle_dupe_cols:
                        while count > 0:
                            counts[name] = count + 1
                            name = f'{name}.{count}'
                            count = counts.get(name, 0)

                    if old_name == '':
                        unnamed_cols.add(name)

                    this_header.append(name)
                    counts[name] = count + 1

                if self.has_mi_columns:

                    # If we have grabbed an extra line, but it's not in our
                    # format, save in the buffer, and create a blank extra
                    # line for the rest of the parsing code.
                    if hr == self.header[-1]:
                        lc = len(this_header)
                        ic = (len(self.index_col) if self.index_col
                              is not None else 0)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            hr -= 1
                            self.parser_start -= 1
                            this_header = [None] * lc

                data_line = hr + 1
                header.append(this_header)

            if self.names is not None:
                header = [ self.names ]

        elif self.names is not None:
            # Enforce this unless usecols
            if not self.has_usecols:
                self.parser.expected_fields = len(self.names)

            # Names passed
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            header = [ self.names ]

            if self.parser.lines < 1:
                field_count = len(header[0])
            else:
                field_count = self.parser.line_fields[data_line]
        else:
            # No header passed nor to be found in the file
            if self.parser.lines < 1:
                self._tokenize_rows(1)

            return None, self.parser.line_fields[0], unnamed_cols

        # Corner case, not enough lines in the file
        if self.parser.lines < data_line + 1:
            field_count = len(header[0])
        else:  # not self.has_usecols:

            field_count = self.parser.line_fields[data_line]

            # #2981
            if self.names is not None:
                field_count = max(field_count, len(self.names))

            passed_count = len(header[0])

            if (self.has_usecols and self.allow_leading_cols and
                    not callable(self.usecols)):
                nuse = len(self.usecols)
                if nuse == passed_count:
                    self.leading_cols = 0
                # Leading (index) columns are only implied when more columns
                # are requested than the header provides names for. A
                # ``usecols`` that is merely a subset of the header must
                # still pass the header/field-count check below.
                elif self.names is None and nuse > passed_count:
                    self.leading_cols = field_count - passed_count
                elif passed_count != field_count:
                    raise ValueError('Passed header names '
                                     'mismatches usecols')
            # oh boy, #2442, #2981
            elif self.allow_leading_cols and passed_count < field_count:
                self.leading_cols = field_count - passed_count

        return header, field_count, unnamed_cols

    def read(self, rows=None):
        """
        Parse and return columns; ``rows=None`` reads all remaining rows.

        With ``low_memory`` set the data is read chunk by chunk to conserve
        intermediate space; otherwise it is read in one go and the parser
        buffers are trimmed afterwards.
        """
        if not self.low_memory:
            # Memory usage is not a concern: single pass, trimming enabled.
            return self._read_rows(rows, 1)

        return self._read_low_memory(rows)

    cdef _read_low_memory(self, rows):
        # Read in pieces of at most ``self.buffer_lines`` rows and join them
        # at the end. ``rows=None`` reads until the data runs out; otherwise
        # reading stops once ``rows`` rows were collected.
        #
        # Raises StopIteration when not a single piece could be read.
        cdef:
            size_t rows_read = 0
            list pieces = []

        if rows is None:
            while True:
                try:
                    piece = self._read_rows(self.buffer_lines, 0)
                except StopIteration:
                    break
                if len(piece) == 0:
                    break
                pieces.append(piece)
        else:
            while rows_read < rows:
                # Never ask for more than is still missing.
                crows = min(self.buffer_lines, rows - rows_read)
                try:
                    piece = self._read_rows(crows, 0)
                except StopIteration:
                    break
                if len(piece) == 0:
                    break

                # Every column of a piece has the same length; use the first.
                rows_read += len(list(piece.values())[0])
                pieces.append(piece)

        parser_trim_buffers(self.parser)

        if len(pieces) == 0:
            raise StopIteration

        # destructive to pieces
        return _concatenate_chunks(pieces)

    cdef _tokenize_rows(self, size_t nrows):
        # Ask the C tokenizer for ``nrows`` more rows (GIL released while it
        # runs), forward any pending warning to stderr, and raise through
        # ``raise_parser_error`` on a negative return code.
        cdef:
            int rc

        with nogil:
            rc = tokenize_nrows(self.parser, nrows)

        # The warning buffer is allocated on the C side: print it, free it,
        # and reset the pointer.
        if self.parser.warn_msg != NULL:
            print(self.parser.warn_msg, file=sys.stderr)
            free(self.parser.warn_msg)
            self.parser.warn_msg = NULL

        if rc < 0:
            raise_parser_error('Error tokenizing data', self.parser)

    cdef _read_rows(self, rows, bint trim):
        # Tokenize (if needed) and convert up to ``rows`` rows, or everything
        # that is left when ``rows`` is None; then drop the converted rows
        # from the parser and, if ``trim`` is set, shrink its buffers.
        #
        # Returns the mapping produced by ``_convert_column_data``.
        # Raises StopIteration when no unread lines remain and ValueError
        # when a row count is combined with ``skipfooter``.
        cdef:
            int64_t buffered_lines
            int64_t irows, footer = 0

        self._start_clock()

        if rows is None:
            # Read everything that is left.
            with nogil:
                status = tokenize_all_rows(self.parser)

            if self.parser.warn_msg != NULL:
                print(self.parser.warn_msg, file=sys.stderr)
                free(self.parser.warn_msg)
                self.parser.warn_msg = NULL

            if status < 0:
                raise_parser_error('Error tokenizing data', self.parser)
            footer = self.skipfooter
        else:
            irows = rows
            # Only tokenize what is not already sitting in the buffer.
            buffered_lines = self.parser.lines - self.parser_start
            if buffered_lines < irows:
                self._tokenize_rows(irows - buffered_lines)

            if self.skipfooter > 0:
                raise ValueError('skipfooter can only be used to read '
                                 'the whole file')

        if self.parser_start >= self.parser.lines:
            raise StopIteration
        self._end_clock('Tokenization')

        self._start_clock()
        columns = self._convert_column_data(rows=rows,
                                            footer=footer,
                                            upcast_na=True)
        self._end_clock('Type conversion')

        self._start_clock()
        if len(columns) > 0:
            # All columns share one length; the first one gives the row count.
            rows_read = len(list(columns.values())[0])
            # trim
            parser_consume_rows(self.parser, rows_read)
            if trim:
                parser_trim_buffers(self.parser)
            self.parser_start -= rows_read
        self._end_clock('Parser memory cleanup')

        return columns

    cdef _start_clock(self):
        # Push the current time onto ``self.clocks`` for a later
        # ``_end_clock`` call.
        #
        # Only done in verbose mode: ``_end_clock`` pops an entry only when
        # ``self.verbose`` is set, so recording unconditionally would make
        # the list grow with every read when verbose output is off.
        if self.verbose:
            self.clocks.append(time.time())

    cdef _end_clock(self, what):
        # In verbose mode, pop the most recent start time and report how
        # long the step named ``what`` took, in milliseconds.
        if not self.verbose:
            return

        started = self.clocks.pop(-1)
        elapsed = time.time() - started
        print(f'{what} took: {elapsed * 1000:.2f} ms')

    def set_noconvert(self, i):
        """Add column index ``i`` to the ``noconvert`` set."""
        skipped = self.noconvert
        skipped.add(i)

    def remove_noconvert(self, i):
        """
        Remove column index ``i`` from the ``noconvert`` set.

        Raises ``KeyError`` if ``i`` is not in the set.
        """
        skipped = self.noconvert
        skipped.remove(i)

    def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
        """
        Convert the buffered tokens of every selected column into an array.

        Parameters
        ----------
        rows : int, optional
            Maximum number of buffered lines to convert, counted from
            ``self.parser_start``. ``None`` converts every buffered line.
        upcast_na : bool, default False
            If True, columns in which NA values were counted are passed
            through ``_maybe_upcast`` (extension-array dtypes excepted).
        footer : int, default 0
            Accepted for interface compatibility; it is not applied here.

        Returns
        -------
        dict
            Maps positional column index to the converted column.

        Raises
        ------
        ParserError
            If fewer fields were tokenized than columns are expected, or if a
            column could not be converted at all.
        """
        cdef:
            int64_t i
            int nused
            kh_str_starts_t *na_hashset = NULL
            int64_t start, end
            object name, na_flist, col_dtype = None
            bint na_filter = 0
            int64_t num_cols

        start = self.parser_start

        if rows is None:
            end = self.parser.lines
        else:
            end = min(start + rows, self.parser.lines)

        # NOTE: ``footer`` does not shrink ``end`` in this method.

        # Widest line seen in the buffer (number of fields).
        num_cols = -1
        # Py_ssize_t cast prevents build warning
        for i in range(<Py_ssize_t>self.parser.lines):
            if self.parser.line_fields[i] > num_cols:
                num_cols = self.parser.line_fields[i]

        if self.table_width - self.leading_cols > num_cols:
            raise ParserError(f"Too many columns specified: expected "
                              f"{self.table_width - self.leading_cols} "
                              f"and found {num_cols}")

        results = {}
        nused = 0
        for i in range(self.table_width):
            if i < self.leading_cols:
                # Pass through leading columns always
                name = i
            elif (self.usecols and not callable(self.usecols) and
                    nused == len(self.usecols)):
                # Once we've gathered all requested columns, stop. GH5766
                break
            else:
                name = self._get_column_name(i, nused)
                usecols = set()
                if callable(self.usecols):
                    if self.usecols(name):
                        usecols = {i}
                else:
                    usecols = self.usecols
                if self.has_usecols and not (i in usecols or
                                             name in usecols):
                    continue
                nused += 1

            conv = self._get_converter(i, name)

            # A dict dtype is looked up by name first, then by position;
            # a structured dtype is indexed by position; anything else
            # applies to every column.
            col_dtype = None
            if self.dtype is not None:
                if isinstance(self.dtype, dict):
                    if name in self.dtype:
                        col_dtype = self.dtype[name]
                    elif i in self.dtype:
                        col_dtype = self.dtype[i]
                else:
                    if self.dtype.names:
                        # structured array
                        col_dtype = np.dtype(self.dtype.descr[i][1])
                    else:
                        col_dtype = self.dtype

            if conv:
                if col_dtype is not None:
                    warnings.warn((f"Both a converter and dtype were specified "
                                   f"for column {name} - only the converter will "
                                   f"be used"), ParserWarning,
                                  stacklevel=5)
                results[i] = _apply_converter(conv, self.parser, i, start, end,
                                              self.c_encoding)
                continue

            # Collect the list of NaN values associated with the column.
            # If we aren't supposed to do that, or none are collected,
            # we set `na_filter` to `0` (`1` otherwise).
            na_flist = set()

            if self.na_filter:
                na_list, na_flist = self._get_na_list(i, name)
                if na_list is None:
                    na_filter = 0
                else:
                    na_filter = 1
                    na_hashset = kset_from_list(na_list)
            else:
                na_filter = 0

            # Attempt to parse tokens and infer dtype of the column.
            # Should return as the desired dtype (inferred or specified).
            try:
                col_res, na_count = self._convert_tokens(
                    i, start, end, name, na_filter, na_hashset,
                    na_flist, col_dtype)
            finally:
                # gh-21353
                #
                # Cleanup the NaN hash that we generated
                # to avoid memory leaks.
                if na_filter:
                    self._free_na_set(na_hashset)
                    # Do not carry a dangling pointer into the next column.
                    na_hashset = NULL

            # Fail before touching ``na_count``/``col_res`` so that an
            # unparseable column is reported as a ParserError.
            if col_res is None:
                raise ParserError(f'Unable to parse column {i}')

            # don't try to upcast EAs
            try_upcast = upcast_na and na_count > 0
            if try_upcast and not is_extension_array_dtype(col_dtype):
                col_res = _maybe_upcast(col_res)

            results[i] = col_res

        self.parser_start += end - start

        return results

    cdef inline _convert_tokens(self, Py_ssize_t i,
                                int64_t start, int64_t end,
                                object name, bint na_filter,
                                kh_str_starts_t *na_hashset,
                                object na_flist, object col_dtype):
        """
        Convert lines ``start``..``end`` of column ``i`` into an array.

        If ``col_dtype`` is given it is tried first. When that attempt yields
        no result, or no dtype was requested, columns listed in
        ``self.noconvert`` are returned as strings and all others are tried
        against each dtype in ``self.dtype_cast_order`` until one succeeds.
        A result obtained through this fallback is then cast to ``col_dtype``.

        ``start`` and ``end`` are ``int64_t`` to match the callers and the
        conversion helpers; a narrower C ``int`` would truncate large line
        offsets.

        Returns
        -------
        tuple
            ``(converted column, NA count)``.

        Raises
        ------
        ValueError
            If the fallback result cannot be cast to ``col_dtype`` without
            changing values.
        """
        if col_dtype is not None:
            col_res, na_count = self._convert_with_dtype(
                col_dtype, i, start, end, na_filter,
                1, na_hashset, na_flist)

            # Fallback on the parse (e.g. we requested int dtype,
            # but its actually a float).
            if col_res is not None:
                return col_res, na_count

        if i in self.noconvert:
            return self._string_convert(i, start, end, na_filter, na_hashset)
        else:
            col_res = None
            for dt in self.dtype_cast_order:
                try:
                    col_res, na_count = self._convert_with_dtype(
                        dt, i, start, end, na_filter, 0, na_hashset, na_flist)
                except ValueError:
                    # This error is raised from trying to convert to uint64,
                    # and we discover that we cannot convert to any numerical
                    # dtype successfully. As a result, we leave the data
                    # column AS IS with object dtype.
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, 0,
                        0, na_hashset, na_flist)
                except OverflowError:
                    # Too large for the integer parsers: keep the values as
                    # objects, still honouring the NA filter.
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype('object'), i, start, end, na_filter,
                        0, na_hashset, na_flist)

                if col_res is not None:
                    break

        # we had a fallback parse on the dtype, so now try to cast
        # only allow safe casts, eg. with a nan you cannot safely cast to int
        if col_res is not None and col_dtype is not None:
            try:
                col_res = col_res.astype(col_dtype, casting='safe')
            except TypeError:

                # float -> int conversions can fail the above
                # even with no nans
                col_res_orig = col_res
                col_res = col_res.astype(col_dtype)
                # Accept the unsafe cast only if it changed no value.
                if (col_res != col_res_orig).any():
                    raise ValueError(
                        f"cannot safely convert passed user dtype of "
                        f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
                        f"column {i}")

        return col_res, na_count

    cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                             int64_t start, int64_t end,
                             bint na_filter,
                             bint user_dtype,
                             kh_str_starts_t *na_hashset,
                             object na_flist):
        """
        Convert lines ``start``..``end`` of column ``i`` to ``dtype``.

        ``user_dtype`` marks a dtype requested by the caller rather than one
        being probed; for such integer and bool columns NA values are an
        error.

        Returns
        -------
        tuple
            ``(values, na_count)``. For integer, float and bool dtypes
            ``values`` is ``None`` when the tokens could not be parsed.

        Raises
        ------
        ValueError
            For a user-requested integer or bool dtype with NA values.
        NotImplementedError
            If an extension array type lacks ``_from_sequence_of_strings``.
        TypeError
            For dtypes that cannot be parsed (fixed-width unicode,
            datetime64, anything unrecognised).
        """
        # Each branch below returns or raises, so the order of the checks is
        # the only thing that matters.
        if is_categorical_dtype(dtype):
            # TODO: I suspect that _categorical_convert could be
            # optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
                self.parser, i, start, end, na_filter,
                na_hashset, self.c_encoding)

            # Method accepts list of strings, not encoded ones.
            decoded_true = [x.decode() for x in self.true_values]
            cat_type = dtype.construct_array_type()
            categorical = cat_type._from_inferred_categories(
                cats, codes, dtype, true_values=decoded_true)
            return categorical, na_count

        if is_extension_array_dtype(dtype):
            strings, na_count = self._string_convert(i, start, end, na_filter,
                                                     na_hashset)
            array_type = dtype.construct_array_type()
            try:
                # use _from_sequence_of_strings if the class defines it
                converted = array_type._from_sequence_of_strings(strings,
                                                                 dtype=dtype)
            except NotImplementedError:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    f"_from_sequence_of_strings in order "
                    f"to be used in parser methods")

            return converted, na_count

        if is_integer_dtype(dtype):
            try:
                values, na_count = _try_int64(self.parser, i, start,
                                              end, na_filter, na_hashset)
                if user_dtype and na_count is not None and na_count > 0:
                    raise ValueError(f"Integer column has NA values in column {i}")
            except OverflowError:
                # Out of int64 range: retry as unsigned.
                values = _try_uint64(self.parser, i, start, end,
                                     na_filter, na_hashset)
                na_count = 0

            if values is not None and dtype != 'int64':
                values = values.astype(dtype)

            return values, na_count

        if is_float_dtype(dtype):
            values, na_count = _try_double(self.parser, i, start, end,
                                           na_filter, na_hashset, na_flist)

            if values is not None and dtype != 'float64':
                values = values.astype(dtype)
            return values, na_count

        if is_bool_dtype(dtype):
            values, na_count = _try_bool_flex(self.parser, i, start, end,
                                              na_filter, na_hashset,
                                              self.true_set, self.false_set)
            if user_dtype and na_count is not None and na_count > 0:
                raise ValueError(f"Bool column has NA values in column {i}")
            return values, na_count

        if dtype.kind == 'S':
            # TODO: na handling
            width = dtype.itemsize
            if width > 0:
                return _to_fw_string(self.parser, i, start, end, width), 0

            # treat as a regular string parsing
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)

        if dtype.kind == 'U':
            width = dtype.itemsize
            if width > 0:
                raise TypeError(f"the dtype {dtype} is not supported for parsing")

            # unicode variable width
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)

        if is_object_dtype(dtype):
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)

        if is_datetime64_dtype(dtype):
            raise TypeError(f"the dtype {dtype} is not supported "
                            f"for parsing, pass this column "
                            f"using parse_dates instead")

        raise TypeError(f"the dtype {dtype} is not supported for parsing")

    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                         bint na_filter, kh_str_starts_t *na_hashset):
        """
        Box lines ``start``..``end`` of column ``i`` as Python strings.

        The boxing routine is picked by ``_string_path`` from
        ``self.c_encoding``. Returns that routine's ``(values, na_count)``.
        """
        cdef StringPath which = _string_path(self.c_encoding)

        if which == ENCODED:
            return _string_box_decode(self.parser, i, start, end,
                                      na_filter, na_hashset, self.c_encoding)
        if which == UTF8:
            return _string_box_utf8(self.parser, i, start, end, na_filter,
                                    na_hashset)

    def _get_converter(self, i, name):
        """
        Look up the converter registered for a column.

        A converter keyed by the column ``name`` takes precedence over one
        keyed by the position ``i``. Returns ``None`` when no converters are
        configured or none matches.
        """
        converters = self.converters
        if converters is None:
            return None

        keyed_by_name = name is not None and name in converters
        # Converter for position, if any
        return converters[name] if keyed_by_name else converters.get(i)

    cdef _get_na_list(self, i, name):
        """
        Return ``(na_values, na_fvalues)`` for column ``i`` / ``name``.

        ``na_values`` is a list of encoded NA strings, or ``None`` when no NA
        values are configured at all; ``na_fvalues`` is the matching set from
        ``self.na_fvalues``. For dict-style ``self.na_values`` the column is
        looked up by name first, then by position; a column without an entry
        gets ``_NA_VALUES`` when ``self.keep_default_na`` is set and an empty
        list otherwise.

        In the non-dict case ``self.na_values`` and ``self.na_fvalues`` are
        converted in place to a list and a set respectively.
        """
        if self.na_values is None:
            return None, set()

        if not isinstance(self.na_values, dict):
            # One global specification shared by all columns.
            if not isinstance(self.na_values, list):
                self.na_values = list(self.na_values)
            if not isinstance(self.na_fvalues, set):
                self.na_fvalues = set(self.na_fvalues)

            return _ensure_encoded(self.na_values), self.na_fvalues

        # Per-column specification.
        if name is not None and name in self.na_values:
            key = name
        elif i in self.na_values:
            key = i
        elif self.keep_default_na:
            # No na_values provided for this column.
            return _NA_VALUES, set()
        else:
            return [], set()

        values = self.na_values[key]
        if values is not None and not isinstance(values, list):
            values = list(values)

        fvalues = self.na_fvalues[key]
        if fvalues is not None and not isinstance(fvalues, set):
            fvalues = set(fvalues)

        return _ensure_encoded(values), fvalues

    cdef _free_na_set(self, kh_str_starts_t *table):
        """
        Destroy a NA hash set via ``kh_destroy_str_starts``.

        A NULL ``table`` is ignored so that a set that was never built can be
        passed safely.
        """
        if table != NULL:
            kh_destroy_str_starts(table)

    cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
        """
        Return the name of the column at position ``i``.

        ``nused`` is the number of columns already selected. It indexes
        ``self.names`` when a non-callable ``usecols`` has exactly as many
        entries as ``self.names``. Without ``usecols``/``names`` the name
        comes from the first header row; ``None`` is returned when there is
        no header.
        """
        cdef int64_t j

        if self.has_usecols and self.names is not None:
            names_match_usecols = (not callable(self.usecols) and
                                   len(self.names) == len(self.usecols))
            if names_match_usecols:
                return self.names[nused]
            return self.names[i - self.leading_cols]

        if self.header is None:
            return None

        j = i - self.leading_cols
        first_row = self.header[0]
        # generate extra (bogus) headers if there are more columns than headers
        if j >= len(first_row):
            return j
        return first_row[j]


cdef:
    # Byte-string spellings of the two boolean literals in three casings.
    # They are not referenced in this part of the file; presumably they are
    # the default true/false tokens of the reader -- verify at the use site.
    object _true_values = [b'True', b'TRUE', b'true']
    object _false_values = [b'False', b'FALSE', b'false']


def _ensure_encoded(list lst):
    """
    Return a new list with every element of ``lst`` as ``bytes``.

    ``bytes`` elements are kept as they are, ``str`` elements are encoded as
    UTF-8, and anything else is converted with ``str()`` and then encoded as
    UTF-8.
    """
    cdef:
        list encoded = []
    for item in lst:
        if isinstance(item, bytes):
            encoded.append(item)
        elif isinstance(item, str):
            encoded.append(PyUnicode_AsUTF8String(item))
        else:
            encoded.append(str(item).encode('utf-8'))
    return encoded


# Common string spellings that are treated as NA.
# Infinity spellings such as '1.#INF', '-1.#INF' and '1.#INF000000' are
# deliberately not part of this set.
STR_NA_VALUES = {
    "-1.#IND",
    "1.#QNAN",
    "1.#IND",
    "-1.#QNAN",
    "#N/A N/A",
    "#N/A",
    "N/A",
    "n/a",
    "NA",
    "<NA>",
    "#NA",
    "NULL",
    "null",
    "NaN",
    "-NaN",
    "nan",
    "-nan",
    "",
}
# Encoded (bytes) form of STR_NA_VALUES; returned by `_get_na_list` for
# columns without their own NA values when `keep_default_na` is set.
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(arr):
    """
    Replace NA sentinel values in ``arr`` by NaN, upcasting as needed.

    Integer arrays are converted to float and every element equal to the
    sentinel registered for the array's dtype in ``na_values`` becomes NaN.
    Boolean arrays are converted to object dtype and every element whose
    underlying byte equals the ``np.uint8`` sentinel becomes NaN. Arrays of
    any other dtype are returned unchanged.
    """
    dtype = arr.dtype

    if issubclass(dtype.type, np.integer):
        sentinel = na_values[dtype]
        as_float = arr.astype(float)
        np.putmask(as_float, as_float == sentinel, np.nan)
        return as_float

    if dtype == np.bool_:
        # The mask must be taken on the raw bytes before the cast.
        is_na = arr.view(np.uint8) == na_values[np.uint8]
        as_object = arr.astype(object)
        np.putmask(as_object, is_na, np.nan)
        return as_object

    return arr


# Selects the routine used to box C strings as Python strings:
# UTF8 uses PyUnicode_FromString, ENCODED uses PyUnicode_Decode with an
# explicit encoding (see _string_box_utf8 / _string_box_decode).
cdef enum StringPath:
    UTF8
    ENCODED


# factored out logic to pick string converter
# Returns ENCODED when `encoding` is non-NULL and differs from b"utf-8",
# and UTF8 otherwise (including a NULL `encoding`).
# NOTE(review): `encoding` is a char* compared against a bytes literal;
# confirm that this compares contents rather than pointer values.
cdef inline StringPath _string_path(char *encoding):
    if encoding != NULL and encoding != b"utf-8":
        return ENCODED
    return UTF8


# ----------------------------------------------------------------------
# Type conversions / inference support code


cdef _string_box_utf8(parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      bint na_filter, kh_str_starts_t *na_hashset):
    """
    Box lines ``line_start``..``line_end`` of column ``col`` as Python
    strings using ``PyUnicode_FromString``.

    Words found in ``na_hashset`` (only consulted when ``na_filter`` is set)
    are replaced by the object NA value and counted. Equal words share a
    single Python string object through a temporary hash table.

    Returns
    -------
    tuple
        ``(object ndarray, na_count)``.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    lines = line_end - line_start
    # Allocate the result before the hash table so that a failed allocation
    # cannot leave the table behind.
    result = np.empty(lines, dtype=np.object_)

    table = kh_init_strbox()
    try:
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # in the hash table
                    na_count += 1
                    result[i] = NA
                    continue

            k = kh_get_strbox(table, word)

            # in the hash table
            if k != table.n_buckets:
                # this increments the refcount, but need to test
                pyval = <object>table.vals[k]
            else:
                # box it. new ref?
                # May raise for bytes that are not valid UTF-8.
                pyval = PyUnicode_FromString(word)

                k = kh_put_strbox(table, word, &ret)
                table.vals[k] = <PyObject *>pyval

            result[i] = pyval
    finally:
        # Always release the table, including when boxing raised.
        kh_destroy_strbox(table)

    return result, na_count


cdef _string_box_decode(parser_t *parser, int64_t col,
                        int64_t line_start, int64_t line_end,
                        bint na_filter, kh_str_starts_t *na_hashset,
                        char *encoding):
    """
    Box lines ``line_start``..``line_end`` of column ``col`` as Python
    strings, decoding each word with ``encoding`` and "strict" error
    handling.

    Words found in ``na_hashset`` (only consulted when ``na_filter`` is set)
    are replaced by the object NA value and counted. Equal words share a
    single Python string object through a temporary hash table.

    Returns
    -------
    tuple
        ``(object ndarray, na_count)``.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        char *errors = "strict"

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    lines = line_end - line_start
    # Allocate the result before the hash table so that a failed allocation
    # cannot leave the table behind.
    result = np.empty(lines, dtype=np.object_)

    table = kh_init_strbox()
    try:
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # in the hash table
                    na_count += 1
                    result[i] = NA
                    continue

            k = kh_get_strbox(table, word)

            # in the hash table
            if k != table.n_buckets:
                # this increments the refcount, but need to test
                pyval = <object>table.vals[k]
            else:
                # box it. new ref?
                # May raise when the word cannot be decoded.
                size = strlen(word)
                pyval = PyUnicode_Decode(word, size, encoding, errors)

                k = kh_put_strbox(table, word, &ret)
                table.vals[k] = <PyObject *>pyval

            result[i] = pyval
    finally:
        # Always release the table, including when decoding raised.
        kh_destroy_strbox(table)

    return result, na_count


@cython.boundscheck(False)
cdef _categorical_convert(parser_t *parser, int64_t col,
                          int64_t line_start, int64_t line_end,
                          bint na_filter, kh_str_starts_t *na_hashset,
                          char *encoding):
    """
    Convert column data into codes, categories.

    Each distinct word of lines ``line_start``..``line_end`` of column
    ``col`` gets a code in order of first appearance; words found in
    ``na_hashset`` (only consulted when ``na_filter`` is set) get code -1
    and are counted.

    Returns
    -------
    tuple
        ``(int64 codes, object array of categories, na_count)``. Categories
        are boxed with ``PyUnicode_FromString`` or, for a non-UTF-8
        ``encoding``, with ``PyUnicode_Decode`` using "strict" errors.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, size, lines
        coliter_t it
        const char *word = NULL

        int64_t NA = -1
        int64_t[:] codes
        int64_t current_category = 0

        char *errors = "strict"
        StringPath path = _string_path(encoding)

        int ret = 0
        kh_str_t *table
        khiter_t k

    lines = line_end - line_start
    codes = np.empty(lines, dtype=np.int64)

    # factorize parsed values, creating a hash table
    # bytes -> category code
    with nogil:
        table = kh_init_str()
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # is in NA values
                    na_count += 1
                    codes[i] = NA
                    continue

            k = kh_get_str(table, word)
            # not in the hash table
            if k == table.n_buckets:
                k = kh_put_str(table, word, &ret)
                table.vals[k] = current_category
                current_category += 1

            codes[i] = table.vals[k]

    # Allocation and boxing below can raise; the table must be released in
    # that case as well.
    try:
        # parse and box categories to python strings
        result = np.empty(table.n_occupied, dtype=np.object_)
        if path == ENCODED:
            for k in range(table.n_buckets):
                if kh_exist_str(table, k):
                    size = strlen(table.keys[k])
                    result[table.vals[k]] = PyUnicode_Decode(
                        table.keys[k], size, encoding, errors)
        elif path == UTF8:
            for k in range(table.n_buckets):
                if kh_exist_str(table, k):
                    result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
    finally:
        kh_destroy_str(table)

    return np.asarray(codes), result, na_count


cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                   int64_t line_end, int64_t width):
    """
    Copy lines ``line_start``..``line_end`` of column ``col`` into a new
    fixed-width bytes array of dtype ``|S{width}``.

    The copy itself runs without the GIL in ``_to_fw_string_nogil``.
    """
    cdef:
        char *buf
        ndarray out
        int64_t nrows = line_end - line_start

    out = np.empty(nrows, dtype=f'|S{width}')
    buf = <char*>out.data

    with nogil:
        _to_fw_string_nogil(parser, col, line_start, line_end, width, buf)

    return out


cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start, int64_t line_end,
                                     size_t width, char *data) nogil:
    # Copies each word of the column into consecutive ``width``-byte slots
    # of ``data`` with strncpy(dest, word, width).
    cdef:
        int64_t row, nrows = line_end - line_start
        coliter_t it
        const char *word = NULL
        char *dest = data

    coliter_setup(&it, parser, col, line_start)

    for row in range(nrows):
        COLITER_NEXT(it, word)
        strncpy(dest, word, width)
        dest += width


# Spellings of infinity accepted by _try_double_nogil when the regular
# double converter rejects a word. They are matched with strcasecmp, i.e.
# case-insensitively.
cdef:
    char* cinf = b'inf'
    char* cposinf = b'+inf'
    char* cneginf = b'-inf'

    char* cinfty = b'Infinity'
    char* cposinfty = b'+Infinity'
    char* cneginfty = b'-Infinity'


cdef _try_double(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
    """
    Try to parse lines ``line_start``..``line_end`` of column ``col`` as
    float64.

    ``na_flist`` holds float values that count as NA; it is only consulted
    when it is non-empty.

    Returns
    -------
    tuple
        ``(float64 ndarray, na_count)``, or ``(None, None)`` if any word
        could not be parsed.
    """
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        float64_t *data
        float64_t NA = na_values[np.float64]
        kh_float64_t *float_na_set
        ndarray values
        bint use_na_flist = len(na_flist) > 0

    lines = line_end - line_start
    values = np.empty(lines, dtype=np.float64)
    data = <float64_t *>values.data

    # Temporary hash set of the float NA values, released right after the
    # GIL-free parse.
    float_na_set = kset_float64_from_list(na_flist)
    with nogil:
        error = _try_double_nogil(parser, parser.double_converter,
                                  col, line_start, line_end,
                                  na_filter, na_hashset, use_na_flist,
                                  float_na_set, NA, data, &na_count)
    kh_destroy_float64(float_na_set)

    if error == 0:
        return values, na_count
    return None, None


cdef inline int _try_double_nogil(parser_t *parser,
                                  float64_t (*double_converter)(
                                      const char *, char **, char,
                                      char, char, int, int *, int *) nogil,
                                  int64_t col, int64_t line_start,
                                  int64_t line_end,
                                  bint na_filter, kh_str_starts_t *na_hashset,
                                  bint use_na_flist,
                                  const kh_float64_t *na_flist,
                                  float64_t NA, float64_t *data,
                                  int *na_count) nogil:
    # Parses each word of the column into ``data`` with ``double_converter``.
    # Words the converter rejects are accepted only if they spell infinity
    # (case-insensitively). Returns 0 on success and 1 at the first word
    # that cannot be parsed. ``na_count[0]`` receives the number of NA
    # entries written.
    #
    # ``col``/``line_start``/``line_end`` are int64_t like in the caller and
    # the sibling helpers, so large offsets are not truncated.
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL
        char *p_end
        khiter_t k64

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
            else:
                data[0] = double_converter(word, &p_end, parser.decimal,
                                           parser.sci, parser.thousands,
                                           1, &error, NULL)
                # Failure: converter error, nothing consumed, or trailing
                # characters left over.
                if error != 0 or p_end == word or p_end[0]:
                    error = 0
                    if (strcasecmp(word, cinf) == 0 or
                            strcasecmp(word, cposinf) == 0 or
                            strcasecmp(word, cinfty) == 0 or
                            strcasecmp(word, cposinfty) == 0):
                        data[0] = INF
                    elif (strcasecmp(word, cneginf) == 0 or
                            strcasecmp(word, cneginfty) == 0):
                        data[0] = NEGINF
                    else:
                        return 1
                # A parsed value may itself be one of the float NA values.
                if use_na_flist:
                    k64 = kh_get_float64(na_flist, data[0])
                    if k64 != na_flist.n_buckets:
                        na_count[0] += 1
                        data[0] = NA
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[0] = double_converter(word, &p_end, parser.decimal,
                                       parser.sci, parser.thousands,
                                       1, &error, NULL)
            if error != 0 or p_end == word or p_end[0]:
                error = 0
                if (strcasecmp(word, cinf) == 0 or
                        strcasecmp(word, cposinf) == 0 or
                        strcasecmp(word, cinfty) == 0 or
                        strcasecmp(word, cposinfty) == 0):
                    data[0] = INF
                elif (strcasecmp(word, cneginf) == 0 or
                        strcasecmp(word, cneginfty) == 0):
                    data[0] = NEGINF
                else:
                    return 1
            data += 1

    return 0


cdef _try_uint64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    """
    Try to parse lines ``line_start``..``line_end`` of column ``col`` as
    uint64.

    Returns
    -------
    ndarray or None
        The uint64 values, or ``None`` if a word could not be parsed for a
        reason other than overflow.

    Raises
    ------
    OverflowError
        If a value overflowed, or if a signed value was seen.
    ValueError
        If ``uint64_conflict`` reports a conflict in the parse state.
    """
    cdef:
        int error
        Py_ssize_t lines
        coliter_t it
        uint64_t *data
        ndarray values
        uint_state state

    lines = line_end - line_start
    values = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *>values.data

    uint_state_init(&state)
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data, &state)

    if error != 0:
        if error != ERROR_OVERFLOW:
            return None
        # Can't get the word variable
        raise OverflowError('Overflow')

    if uint64_conflict(&state):
        raise ValueError('Cannot convert to numerical dtype')

    if state.seen_sint:
        raise OverflowError('Overflow')

    return values


cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
                                  int64_t line_start,
                                  int64_t line_end, bint na_filter,
                                  const kh_str_starts_t *na_hashset,
                                  uint64_t *data, uint_state *state) nogil:
    # Fills ``data`` with the uint64 value of each word of the column.
    # With ``na_filter`` set, words in ``na_hashset`` are stored as 0 and
    # flagged through ``state.seen_null``. Returns 0 on success or the
    # error code of the first word str_to_uint64 rejects.
    cdef:
        int error
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, word)
            data[row] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                      &error, parser.thousands)
            if error != 0:
                return error
        return 0

    for row in range(nrows):
        COLITER_NEXT(it, word)
        if kh_get_str_starts_item(na_hashset, word):
            # in the hash table
            state.seen_null = 1
            data[row] = 0
        else:
            data[row] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                      &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_int64(parser_t *parser, int64_t col,
                int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_starts_t *na_hashset):
    """
    Try to parse lines ``line_start``..``line_end`` of column ``col`` as
    int64.

    Returns
    -------
    tuple
        ``(int64 ndarray, na_count)``, or ``(None, None)`` if a word could
        not be parsed for a reason other than overflow. NA entries hold the
        int64 sentinel from ``na_values``.

    Raises
    ------
    OverflowError
        If a value does not fit into int64.
    """
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        coliter_t it
        int64_t *data
        ndarray values
        int64_t NA = na_values[np.int64]

    lines = line_end - line_start
    values = np.empty(lines, dtype=np.int64)
    data = <int64_t *>values.data
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_int64_nogil(parser, col, line_start, line_end,
                                 na_filter, na_hashset, NA, data, &na_count)

    if error != 0:
        if error != ERROR_OVERFLOW:
            return None, None
        # Can't get the word variable
        raise OverflowError('Overflow')

    return values, na_count


cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
                                 int64_t line_start,
                                 int64_t line_end, bint na_filter,
                                 const kh_str_starts_t *na_hashset, int64_t NA,
                                 int64_t *data, int *na_count) nogil:
    # Fills ``data`` with the int64 value of each word of the column.
    # With ``na_filter`` set, words in ``na_hashset`` are stored as ``NA``
    # and counted in ``na_count[0]``. Returns 0 on success or the error
    # code of the first word str_to_int64 rejects.
    cdef:
        int error
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, word)
            data[row] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                     &error, parser.thousands)
            if error != 0:
                return error
        return 0

    for row in range(nrows):
        COLITER_NEXT(it, word)
        if kh_get_str_starts_item(na_hashset, word):
            # in the hash table
            na_count[0] += 1
            data[row] = NA
        else:
            data[row] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                     &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_bool_flex(parser_t *parser, int64_t col,
                    int64_t line_start, int64_t line_end,
                    bint na_filter, const kh_str_starts_t *na_hashset,
                    const kh_str_starts_t *true_hashset,
                    const kh_str_starts_t *false_hashset):
    """
    Try to parse lines ``line_start``..``line_end`` of column ``col`` as
    booleans.

    Words in ``true_hashset`` / ``false_hashset`` are taken as True / False;
    any other word is handed to ``to_boolean``.

    Returns
    -------
    tuple
        ``(bool ndarray, na_count)``, or ``(None, None)`` if a word could
        not be interpreted. The array is a ``np.bool_`` view of uint8 data,
        with NA entries holding the bool sentinel from ``na_values``.
    """
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        uint8_t *data
        ndarray raw
        uint8_t NA = na_values[np.bool_]

    lines = line_end - line_start
    raw = np.empty(lines, dtype=np.uint8)
    data = <uint8_t *>raw.data
    with nogil:
        error = _try_bool_flex_nogil(parser, col, line_start, line_end,
                                     na_filter, na_hashset, true_hashset,
                                     false_hashset, NA, data, &na_count)

    if error == 0:
        return raw.view(np.bool_), na_count
    return None, None


cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start,
                                     int64_t line_end, bint na_filter,
                                     const kh_str_starts_t *na_hashset,
                                     const kh_str_starts_t *true_hashset,
                                     const kh_str_starts_t *false_hashset,
                                     uint8_t NA, uint8_t *data,
                                     int *na_count) nogil:
    # Fills ``data`` with 1/0 for each word of the column. Lookup order per
    # word: NA set (only with ``na_filter``), true set, false set, then
    # to_boolean(). Returns 0 on success or the error code of the first
    # word to_boolean rejects. ``na_count[0]`` receives the NA count.
    cdef:
        int error = 0
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[row] = NA
            elif kh_get_str_starts_item(true_hashset, word):
                data[row] = 1
            elif kh_get_str_starts_item(false_hashset, word):
                data[row] = 0
            else:
                error = to_boolean(word, data + row)
                if error != 0:
                    return error
    else:
        for row in range(nrows):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(true_hashset, word):
                data[row] = 1
            elif kh_get_str_starts_item(false_hashset, word):
                data[row] = 0
            else:
                error = to_boolean(word, data + row)
                if error != 0:
                    return error

    return 0


cdef kh_str_starts_t* kset_from_list(list values) except NULL:
    # Build a string hash set from a list of ``bytes`` objects.
    #
    # The caller owns the returned table and is responsible for freeing it.
    # Raises ValueError (after destroying the partial table) if any element
    # is not ``bytes``.
    cdef:
        Py_ssize_t pos, n_values
        kh_str_starts_t *table
        int ret = 0
        object item

    table = kh_init_str_starts()
    n_values = len(values)

    for pos in range(n_values):
        item = values[pos]

        if isinstance(item, bytes):
            kh_put_str_starts_item(table, PyBytes_AsString(item), &ret)
        else:
            # None creeps in sometimes, which isn't possible here
            kh_destroy_str_starts(table)
            raise ValueError('Must be all encoded bytes')

    if table.table.n_buckets <= 128:
        # Grow small tables eightfold so they are almost empty: fewer
        # collisions on lookup makes the "key not in table" case faster,
        # at the cost of a larger memory footprint.
        kh_resize_str_starts(table, table.table.n_buckets * 8)

    return table


cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    # Build a float64 hash set from an iterable of values, each converted
    # with ``float()``.
    #
    # The caller owns the returned table and is responsible for freeing it.
    # If iterating ``values`` or converting an element raises, the partially
    # built table is destroyed before the exception propagates, so nothing
    # is leaked on the error path.
    cdef:
        kh_float64_t *table
        int ret = 0
        float64_t val
        object value

    table = kh_init_float64()

    try:
        for value in values:
            val = float(value)
            kh_put_float64(table, val, &ret)
    except BaseException:
        # No caller will ever see this table, so free it here.
        kh_destroy_float64(table)
        raise

    if table.n_buckets <= 128:
        # Grow small tables eightfold so they are almost empty: fewer
        # collisions on lookup makes the "key not in table" case faster,
        # at the cost of a larger memory footprint.
        kh_resize_float64(table, table.n_buckets * 8)
    return table


cdef raise_parser_error(object base, parser_t *parser):
    # Raise an exception describing a parser failure; always raises.
    #
    # If a Python exception is already pending it is fetched and re-raised
    # (see below).  Otherwise a ParserError is raised whose message is
    # ``base`` followed by the parser's C-level ``error_msg``, or a
    # placeholder when no message was set.
    cdef:
        object old_exc
        object exc_type
        PyObject *type
        PyObject *value
        PyObject *traceback

    if PyErr_Occurred():
        PyErr_Fetch(&type, &value, &traceback)
        # The fetched traceback is not used; drop our reference to it.
        Py_XDECREF(traceback)

        if value != NULL:
            # The cast to object takes its own reference, so the raw
            # reference obtained from PyErr_Fetch can be released.
            old_exc = <object>value
            Py_XDECREF(value)

            # PyErr_Fetch only returned the error message in *value,
            # so the Exception class must be extracted from *type.
            if isinstance(old_exc, str):
                if type != NULL:
                    exc_type = <object>type
                else:
                    exc_type = ParserError

                Py_XDECREF(type)
                raise exc_type(old_exc)
            else:
                Py_XDECREF(type)
                raise old_exc

        # NOTE(review): when ``value`` is NULL, control falls through to the
        # generic message below without a Py_XDECREF(type) -- confirm
        # whether that reference is leaked on this path.

    message = f'{base}. C error: '
    if parser.error_msg != NULL:
        message += parser.error_msg.decode('utf-8')
    else:
        message += 'no error message set'

    raise ParserError(message)


def _concatenate_chunks(list chunks):
    """
    Concatenate per-chunk column arrays into one array per column.

    Parameters
    ----------
    chunks : list of dict
        Each dict maps a column name to that chunk's array. Column names
        are taken from the first chunk. Every chunk is consumed: columns
        are popped from the dicts as they are processed.

    Returns
    -------
    dict
        Maps each column name to the concatenation of its chunk arrays.

    Warns
    -----
    DtypeWarning
        If, for some column, the non-categorical dtypes differ between
        chunks and their common type is ``object``.
    """
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns = []
        object warning_names
        object common_type

    result = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
        if len(numpy_dtypes) > 1:
            common_type = np.find_common_type(numpy_dtypes, [])
            if common_type == object:
                warning_columns.append(str(name))

        # Any one of the observed dtypes decides how the column is combined.
        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            sort_categories = isinstance(dtype, str)
            result[name] = union_categoricals(arrs,
                                              sort_categories=sort_categories)
        else:
            if is_extension_array_dtype(dtype):
                array_type = dtype.construct_array_type()
                result[name] = array_type._concat_same_type(arrs)
            else:
                result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        # The two sentences are adjacent literals joined by implicit
        # concatenation, so the separating space must be part of the first.
        warning_message = (
            f"Columns ({warning_names}) have mixed types. "
            "Specify dtype option on import or set low_memory=False."
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=8)
    return result


# ----------------------------------------------------------------------
# NA values
def _compute_na_values():
    """
    Build the table of per-dtype NA sentinel values.

    Returns
    -------
    dict
        Maps numpy scalar types to their sentinel: ``nan`` for float64 and
        object, the minimum value for signed integers, the maximum value
        for unsigned integers, and the uint8 maximum for bool.
    """
    signed_types = (np.int64, np.int32, np.int16, np.int8)
    unsigned_types = (np.uint64, np.uint32, np.uint16, np.uint8)

    sentinels = {np.float64: np.nan}
    for int_type in signed_types:
        sentinels[int_type] = np.iinfo(int_type).min
    for uint_type in unsigned_types:
        sentinels[uint_type] = np.iinfo(uint_type).max

    # bool shares the uint8 sentinel
    sentinels[np.bool_] = np.iinfo(np.uint8).max
    sentinels[np.object_] = np.nan   # oof
    return sentinels


na_values = _compute_na_values()

# Register every sentinel under the equivalent ``np.dtype`` key as well.
# The keys are snapshotted first because the dict grows during the loop.
for k in tuple(na_values.keys()):
    na_values[np.dtype(k)] = na_values[k]


cdef _apply_converter(object f, parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      char* c_encoding):
    # Apply the Python callable ``f`` to every word of column ``col`` in rows
    # [line_start, line_end).
    #
    # Each word is turned into a ``str`` first: via PyUnicode_FromString when
    # the encoding test below passes, otherwise via PyUnicode_Decode with
    # ``c_encoding`` and "strict" error handling.  The collected results are
    # passed through ``lib.maybe_convert_objects`` before being returned.
    cdef:
        Py_ssize_t row, n_rows
        coliter_t it
        const char *word = NULL
        char *errors = "strict"
        ndarray[object] converted

    n_rows = line_end - line_start
    converted = np.empty(n_rows, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    # NOTE(review): c_encoding is a char*; confirm that comparing it with a
    # bytes literal compares contents rather than pointers.
    if c_encoding == NULL or c_encoding == b'utf-8':
        for row in range(n_rows):
            COLITER_NEXT(it, word)
            converted[row] = f(PyUnicode_FromString(word))
    else:
        for row in range(n_rows):
            COLITER_NEXT(it, word)
            converted[row] = f(PyUnicode_Decode(word, strlen(word),
                                                c_encoding, errors))

    return lib.maybe_convert_objects(converted)


def _maybe_encode(values):
    """
    Return a new list with every ``str`` in ``values`` encoded as UTF-8.

    Non-``str`` elements are passed through unchanged. ``None`` yields an
    empty list.
    """
    encoded = []
    if values is not None:
        for item in values:
            if isinstance(item, str):
                item = item.encode('utf-8')
            encoded.append(item)
    return encoded


def sanitize_objects(ndarray[object] values, set na_values,
                     bint convert_empty=True):
    """
    Replace NA-like entries of ``values`` with ``np.nan``, in place.

    An entry is replaced when it is a member of ``na_values`` or, if
    ``convert_empty`` is True, when it compares equal to the empty string.
    Remaining entries that are equal to an earlier entry are rebound to
    that first-seen object.

    Parameters
    ----------
    values : ndarray[object]
        Modified in place.
    na_values : set
    convert_empty : bool, default True

    Returns
    -------
    int
        Number of entries replaced with ``np.nan``.
    """
    cdef:
        Py_ssize_t idx, size
        object item, nan_obj
        Py_ssize_t n_replaced = 0
        dict first_seen = {}

    nan_obj = np.nan
    size = len(values)

    for idx in range(size):
        item = values[idx]

        if (convert_empty and item == '') or (item in na_values):
            values[idx] = nan_obj
            n_replaced += 1
            continue

        if item in first_seen:
            # reuse the object stored for the first equal value
            values[idx] = first_seen[item]
        else:
            first_seen[item] = item

    return n_replaced
